# --- Product 1 (48740784): load and prepare the daily sales data ---
pr1 = read.csv("alldata_item1.csv")
pr1 <- as.data.table(pr1)
# Drop the exported row index and the weekday text column (recomputed below).
pr1 <- pr1[,-c("X","w_day")]
pr1 <- mutate(pr1, event_date = mdy(event_date)) # converting event date into datetime object
# NOTE(review): item 1 is parsed with mdy() while items 2 and 3 use dmy() --
# confirm alldata_item1.csv really stores dates as month-day-year.
pr1[, Month:=as.numeric(lubridate::month(event_date,label=F))] #adding month information as a numeric variable
pr1[, Day:=as.numeric(lubridate::wday(event_date,label=F))] #adding day information as a numeric variable
head(pr1)
## price event_date product_content_id sold_count visit_count favored_count
## 1: 349.99 2021-06-18 48740784 3 151 4
## 2: -1.00 2021-06-17 48740784 0 153 12
## 3: 419.99 2021-06-16 48740784 5 320 17
## 4: 699.98 2021-06-15 48740784 4 286 16
## 5: 699.98 2021-06-14 48740784 5 344 25
## 6: 599.99 2021-06-13 48740784 0 197 13
## basket_count category_sold category_brand_sold category_visits ty_visits
## 1: 5 770 9 243772 91784941
## 2: 10 1039 6 223359 102409626
## 3: 19 1080 12 230842 105918459
## 4: 25 1067 12 226804 103541571
## 5: 22 965 14 227173 107738598
## 6: 10 771 2 228212 124336805
## category_basket category_favored Month weeknumber Day
## 1: 28289 17183 6 24 6
## 2: 6755 15156 6 24 5
## 3: 7089 15149 6 24 4
## 4: 6554 16328 6 24 3
## 5: 6656 16060 6 24 2
## 6: 6205 14628 6 23 1
Because there are a lot of NA values for the sold amount of the product, I dropped these rows.
pr1 <- drop_na(pr1)
However, since we are going to build ARIMA models, we need a continuous time series. Therefore, I take only the most recent part of the data set.
# Keep only the 41 most recent days (rows are stored newest-first), which form
# a gap-free stretch after the NA rows were dropped.
pr1 <- pr1[c(1:41),]
Now, we have very small data set.
For time series analysis, I only deal with the sold amount of the product so I drop possible regressors
# Univariate working table: date plus the response variable only.
sold <- data.table(event_date =pr1$event_date,
sold_count = pr1$sold_count)
head(sold)
## event_date sold_count
## 1: 2021-06-18 3
## 2: 2021-06-17 0
## 3: 2021-06-16 5
## 4: 2021-06-15 4
## 5: 2021-06-14 5
## 6: 2021-06-13 0
To detect outliers, first I draw boxplot
boxplot(sold$sold_count)  # quick visual outlier check on the raw counts
ggplot(sold, aes(x = event_date)) +
geom_line(aes(y = sold_count)) + ggtitle("Product 48740784 Sold Amount") +
xlab("Date") + ylab("Amount Sold")
#### ACF Plot
# Autocorrelation structure of the raw series (trend/seasonality show up here).
acf(sold$sold_count)
pacf(sold$sold_count)
Actually, we only have 41 data points, so we only check weekly seasonality.
# rev() puts the newest-first data into chronological order; frequency = 7
# encodes the weekly seasonal period.
soldts <- ts(rev(pr1$sold_count), freq = 7, start= c(1,1))
resultweekdec <- decompose(soldts,type= "additive")  # classical additive decomposition
plot(resultweekdec)
plot(resultweekdec$random)
title("Random Term of Weekly Decomposed Data")
With so little history for this product, there is no way to build a model using a different seasonality. Therefore, I have to use weekly seasonality.
# Extract the three decomposition components; the ARIMA model is fitted on the
# random (remainder) component only.
random = resultweekdec$random
trend = resultweekdec$trend
season = resultweekdec$seasonal
# na.pass keeps the NA padding decompose() leaves at both ends of trend/random.
acf(random, na.action = na.pass)
pacf(random, na.action = na.pass)
From acf and pacf plots, the random part is already stationary. We can use auto.arima function to find best model.
model <- auto.arima(random)
model
## Series: random
## ARIMA(1,0,0)(1,0,0)[7] with zero mean
##
## Coefficients:
## ar1 sar1
## -0.3775 -0.4356
## s.e. 0.1580 0.1691
##
## sigma^2 estimated as 0.9637: log likelihood=-48.8
## AIC=103.6 AICc=104.37 BIC=108.27
The best model was found by auto.arima with p=1 and P =1 SARIMA model with season frequency 7.
plot(model$residuals)
title("Residuals")
# Fitted values on the random scale, then re-assembled with trend + season to
# compare against the original series on its own scale.
modelfit <- random - model$residuals
fitted <- modelfit+trend+season
plot(soldts)
points(fitted, type= "l", col = 2)
# Scan candidate external regressors against sold_count (column indices of pr1).
ggpairs(pr1 ,c(4,5,6,7,8,9,10,11,12,13 ))
The only reasonable correlation looks like basket count.
ggpairs(pr1 ,c(4,7))  # sold_count vs basket_count only
# Hold out the 7 most recent days (rows 1:7; data are newest-first) as test set.
traindata <- sold[-c(1:7),]
head(traindata)
## event_date sold_count
## 1: 2021-06-11 0
## 2: 2021-06-10 2
## 3: 2021-06-09 0
## 4: 2021-06-08 2
## 5: 2021-06-07 2
## 6: 2021-06-06 3
testdata <- sold[c(1:7),]
head(testdata)
## event_date sold_count
## 1: 2021-06-18 3
## 2: 2021-06-17 0
## 3: 2021-06-16 5
## 4: 2021-06-15 4
## 5: 2021-06-14 5
## 6: 2021-06-13 0
# External regressor (basket_count) for the training rows.
# NOTE(review): traindata is reversed into chronological order before the ts is
# built below, but this regressor vector is NOT reversed -- verify that xreg and
# the series are aligned in the same time order before trusting the fit.
regressor <- pr1$basket_count[-c(1:7)]
head(regressor)
## [1] 9 14 5 17 18 16
traindatats <- ts(rev(traindata$sold_count),frequency = 7, start = c(1,1))
resultdec <- decompose(traindatats,type= "additive")
trend = resultdec$trend
season = resultdec$seasonal
random = resultdec$random
plot(resultdec)
model <- auto.arima(random)  # baseline SARIMA on the random component (no xreg)
summary(model)
## Series: random
## ARIMA(0,0,1)(1,0,0)[7] with zero mean
##
## Coefficients:
## ma1 sar1
## -0.5736 -0.3373
## s.e. 0.1716 0.1982
##
## sigma^2 estimated as 0.6601: log likelihood=-33.51
## AIC=73.01 AICc=74.01 BIC=77.01
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.09203436 0.7829105 0.663417 94.84761 341.3919 0.5509734
## ACF1
## Training set 0.0950159
model <- auto.arima(random, xreg = regressor)  # same search with basket_count as regressor
summary(model)
## Series: random
## Regression with ARIMA(0,0,1) errors
##
## Coefficients:
## ma1 xreg
## -0.5881 0.0022
## s.e. 0.1650 0.0053
##
## sigma^2 estimated as 0.7403: log likelihood=-34.69
## AIC=75.39 AICc=76.39 BIC=79.39
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06286686 0.8290917 0.6856158 60.91982 360.5958 0.5694097
## ACF1
## Training set 0.1069631
The added regressor has some effect on the model, so we continue with this model.
# NOTE(review): this refit is byte-identical to the previous auto.arima call;
# the duplicated fit and its summary could be removed without changing results.
model <- auto.arima(random, xreg = regressor)
summary(model)
## Series: random
## Regression with ARIMA(0,0,1) errors
##
## Coefficients:
## ma1 xreg
## -0.5881 0.0022
## s.e. 0.1650 0.0053
##
## sigma^2 estimated as 0.7403: log likelihood=-34.69
## AIC=75.39 AICc=76.39 BIC=79.39
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06286686 0.8290917 0.6856158 60.91982 360.5958 0.5694097
## ACF1
## Training set 0.1069631
# 10-step-ahead forecast on the random scale; newxreg holds the held-out basket
# counts. NOTE(review): rev() is applied here although the training regressor
# above was not reversed -- confirm both are in the same time order.
model.forecast <- predict(model, n.ahead = 10, newxreg = rev(pr1$basket_count[c(1:10)]))$pred
# NOTE(review): here the LAST 10 observed trend values are added elementwise to
# the 10 forecasts, whereas the item-2 forecast holds a single last trend value
# constant -- confirm which convention is intended.
last.trend.value <-tail(resultdec$trend[!is.na(resultdec$trend)],10)
seasonality <- resultdec$seasonal[22:31]  # seasonal factors phased to the forecast days
forecast_normalized <- model.forecast+last.trend.value+seasonality  # back to the original scale
forecast_normalized= ts(forecast_normalized, frequency = 7, start=c(5,4))
testdata <- ts(rev(testdata$sold_count), frequency = 7, start=c(5,4))
plot(testdata,ylim = c(0,10))
points(forecast_normalized,type= "l", col = 2)
# In-sample fit on the random scale, plotted against the random component.
modelfit <- random - model$residuals
plot(random)
points(modelfit, type= "l", col = 2)
Meaningful data for this product is starting from 23rd of January 2021. Therefore, we only have 4 month data set.
# --- Product 2 (73318567): load and prepare the daily sales data ---
pr2 = read.csv("alldata_item2.csv")
pr2 <- as.data.table(pr2)
pr2 <- pr2[,-c("X","w_day")]  # drop the exported row index and weekday text column
pr2 <- mutate(pr2, event_date = dmy(event_date)) # converting event date into datetime object
pr2[, Month:=as.numeric(lubridate::month(event_date,label=F))] #adding month information as a numeric variable
pr2[, Day:=as.numeric(lubridate::wday(event_date,label=F))] #adding day information as a numeric variable
head(pr2)
## price event_date product_content_id sold_count visit_count favored_count
## 1: 59.99 2021-06-18 73318567 46 7015 485
## 2: 61.56 2021-06-17 73318567 68 8220 559
## 3: 59.99 2021-06-16 73318567 72 8548 574
## 4: 59.99 2021-06-15 73318567 44 8585 531
## 5: 59.99 2021-06-14 73318567 67 10221 675
## 6: 61.16 2021-06-13 73318567 44 10354 759
## basket_count category_sold category_brand_sold category_visits ty_visits
## 1: 186 5860 3459 788525 91784941
## 2: 245 6726 3998 918671 102409626
## 3: 365 6044 4296 830677 105918459
## 4: 188 6007 4470 839150 103541571
## 5: 350 6707 5177 913292 107738598
## 6: 196 5980 3803 1132270 124336805
## category_basket category_favored Month weeknumber Day
## 1: 29529 66037 6 24 6
## 2: 33838 78801 6 24 5
## 3: 29629 68988 6 24 4
## 4: 29324 64562 6 24 3
## 5: 33397 72409 6 24 2
## 6: 36258 98006 6 23 1
To build an ARIMA model, I only need the sold_count output variable.
# Univariate working table for product 2: date plus the response only.
sold <- data.table(event_date =pr2$event_date,
sold_count = pr2$sold_count)
head(sold)
## event_date sold_count
## 1: 2021-06-18 46
## 2: 2021-06-17 68
## 3: 2021-06-16 72
## 4: 2021-06-15 44
## 5: 2021-06-14 67
## 6: 2021-06-13 44
To detect outliers, first I draw boxplot
boxplot(sold$sold_count)  # visual outlier check
ggplot(sold, aes(x = event_date)) +
geom_line(aes(y = sold_count)) + ggtitle("Product 73318567 Sold Amount") +
xlab("Date") + ylab("Amount Sold")
Even though I only chose nonzero, non-NA sold counts while exporting the data, sold_count is sometimes still around 0. The reason may be that this product is a bikini, so we do not expect it to sell in winter. However, around the end of February a reasonable number of items were sold, which may have been caused by a discount.
Now, let us check ACF and PACF plots.
acf(sold$sold_count)
It seems like we have some trend in the data. We could apply differencing, but since we will use decomposition of the data there is no need to difference.
pacf(sold$sold_count)
Actually, we only have 4 months of data points. To be able to decompose the data with monthly periods, we need at least 12 months of data. Therefore, we only check weekly seasonality.
# Chronological weekly ts (rev() undoes the newest-first ordering); first try
# an additive decomposition.
soldts <- ts(rev(pr2$sold_count), freq = 7, start= c(1,1))
resultweekdec <- decompose(soldts,type= "additive")
plot(resultweekdec)
With additive decomposition, the random term has too much variance. Therefore, for this product, multiplicative decomposition is better.
# Rebuild the series and switch to a multiplicative decomposition, which
# stabilises the remainder for this product.
soldts <- ts(rev(pr2$sold_count), freq = 7, start= c(1,1))
resultweekdec <- decompose(soldts,type= "multiplicative")
plot(resultweekdec)
plot(resultweekdec$random)
title("Random Term of Weekly Decomposed Data")
Still, there is some outliers in the random term. However, we continue with this decomposed object.
# Components of the multiplicative decomposition; the model is fitted on random.
random = resultweekdec$random
trend = resultweekdec$trend
season = resultweekdec$seasonal
acf(random, na.action = na.pass)   # na.pass tolerates decompose()'s NA padding
pacf(random, na.action = na.pass)
From ACF and PACF plots, we can use 3 auto correlation coefficient. However, auto.arima function can also find a better model.
# Manual order search around the ACF/PACF suggestion, then an auto.arima run
# for comparison; candidates are judged by AIC.
model <- arima(random, order= c(3,0,0))
model
##
## Call:
## arima(x = random, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## -0.0140 -0.1657 -0.3469 0.9645
## s.e. 0.0919 0.0906 0.0923 0.0254
##
## sigma^2 estimated as 0.1523: log likelihood = -49.45, aic = 108.91
model <- arima(random, order= c(4,0,0))
model
##
## Call:
## arima(x = random, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.0310 -0.1470 -0.3469 0.1293 0.9647
## s.e. 0.0975 0.0907 0.0915 0.0998 0.0288
##
## sigma^2 estimated as 0.1497: log likelihood = -48.62, aic = 109.24
model <- arima(random, order= c(2,0,0))
model
##
## Call:
## arima(x = random, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.0495 -0.1785 0.9625
## s.e. 0.0966 0.0969 0.0365
##
## sigma^2 estimated as 0.1736: log likelihood = -56.01, aic = 120.02
model <- auto.arima(random)
model
## Series: random
## ARIMA(0,0,0)(0,0,1)[7] with non-zero mean
##
## Coefficients:
## sma1 mean
## -0.1684 0.9613
## s.e. 0.1078 0.0348
##
## sigma^2 estimated as 0.1787: log likelihood=-56.55
## AIC=119.11 AICc=119.35 BIC=127.01
Among those models, we should use the one with 3 autoregressive terms because it has the smallest AIC value.
# Refit the chosen AR(3) model and inspect residuals and in-sample fit.
model <- arima(random, order= c(3,0,0))
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## -0.0140 -0.1657 -0.3469 0.9645
## s.e. 0.0919 0.0906 0.0923 0.0254
##
## sigma^2 estimated as 0.1523: log likelihood = -49.45, aic = 108.91
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.003561498 0.3902301 0.2974092 -57.60445 78.47488 0.6933556
## ACF1
## Training set 0.0427066
plot(model$residuals)
title("Residuals")
modelfit <- random - model$residuals
# Multiplicative decomposition, so the fit is re-assembled by multiplication.
fitted <- modelfit*trend*season
plot(soldts)
points(fitted, type= "l", col = 2)
Even though the model cannot catch the jumps in the data, the fit looks good. Now, we can improve the model by adding external regressors.
ggpairs(pr2, columns = c(4,7,8,10,12,13 ))
For this set, Basket Count and Category Basket can be used.
ggpairs(pr2, columns = c(4,5,6,9,11))
From this set, we can use Visit Count attribute.
Overall, we selected our regressors as Basket Count, Category Basket and Visit Count.
# Hold out the 7 most recent days (rows 1:7; data are newest-first) as test set.
traindata <- sold[-c(1:7),]
head(traindata)
## event_date sold_count
## 1: 2021-06-11 44
## 2: 2021-06-10 56
## 3: 2021-06-09 36
## 4: 2021-06-08 59
## 5: 2021-06-07 53
## 6: 2021-06-06 23
testdata <- sold[c(1:7),]
head(testdata)
## event_date sold_count
## 1: 2021-06-18 46
## 2: 2021-06-17 68
## 3: 2021-06-16 72
## 4: 2021-06-15 44
## 5: 2021-06-14 67
## 6: 2021-06-13 44
# Candidate regressors (visit_count, basket_count, category_basket) for the
# training rows. NOTE(review): as with item 1, these are not reversed although
# the training ts below is -- verify the time alignment.
regressors <- pr2[-c(1:7), c(5,7,12)]
head(regressors)
## visit_count basket_count category_basket
## 1: 6745 190 24678
## 2: 7314 284 25809
## 3: 6507 144 31428
## 4: 7556 252 32786
## 5: 8335 318 30789
## 6: 9567 160 40537
# Chronological training ts, multiplicative decomposition, then models on the
# random component: plain AR(3), AR(3)+xreg, and an auto.arima search with xreg.
traindatats <- ts(rev(traindata$sold_count),frequency = 7, start = c(1,1))
resultdec <- decompose(traindatats,type= "multiplicative")
trend = resultdec$trend
season = resultdec$seasonal
random = resultdec$random
plot(resultdec)
model <- arima(random, order= c(3,0,0))
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## -0.0035 -0.1648 -0.3587 0.9662
## s.e. 0.0949 0.0935 0.0959 0.0270
##
## sigma^2 estimated as 0.1599: log likelihood = -48.46, aic = 106.92
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.003936708 0.3998318 0.3069863 -60.88363 82.4063 0.6968886
## ACF1
## Training set 0.04938984
# arima() requires a numeric matrix for xreg, hence as.matrix().
model <- arima(random, order= c(3,0,0), xreg = as.matrix(regressors))
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 0), xreg = as.matrix(regressors))
##
## Coefficients:
## ar1 ar2 ar3 intercept visit_count basket_count
## -0.0109 -0.1628 -0.3754 0.9667 0 1e-04
## s.e. 0.0637 0.0160 0.0950 0.0336 NaN 1e-04
## category_basket
## 0
## s.e. NaN
##
## sigma^2 estimated as 0.1548: log likelihood = -46.92, aic = 109.85
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.004552634 0.3934125 0.3127563 -59.96818 82.56838 0.7099871
## ACF1
## Training set 0.04466163
model <- auto.arima(random, xreg = as.matrix(regressors))
summary(model)
## Series: random
## Regression with ARIMA(0,0,0)(1,0,0)[7] errors
##
## Coefficients:
## sar1 intercept visit_count basket_count category_basket
## -0.1719 0.9605 0e+00 0e+00 0
## s.e. 0.2491 0.0473 2e-04 3e-04 0
##
## sigma^2 estimated as 0.1914: log likelihood=-54.4
## AIC=120.81 AICc=121.75 BIC=136.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0002567742 0.425993 0.3287213 20.47873 55.05169 0.6884323
## ACF1
## Training set 0.09892385
When we add all 3 regressors, none of them is significant in the model, so as a last attempt I am going to build an ARIMA model with only one external regressor.
# Retry with basket_count as the single external regressor.
regressors <- pr2[-c(1:7), 7]
head(regressors)
## basket_count
## 1: 190
## 2: 284
## 3: 144
## 4: 252
## 5: 318
## 6: 160
model <- auto.arima(random, xreg = as.matrix(regressors))
summary(model)
## Series: random
## Regression with ARIMA(0,0,0)(0,0,1)[7] errors
##
## Coefficients:
## sma1 intercept basket_count
## -0.1719 0.9414 1e-04
## s.e. 0.1146 0.0537 2e-04
##
## sigma^2 estimated as 0.1903: log likelihood=-55.17
## AIC=118.33 AICc=118.77 BIC=128.59
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001181051 0.4293904 0.3251242 21.46187 56.36737 0.680899
## ACF1
## Training set 0.09971225
# AR(3) with the single regressor; this is the model carried into forecasting.
model <- arima(random, order= c(3,0,0), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 0), xreg = regressors)
##
## Coefficients:
## ar1 ar2 ar3 intercept basket_count
## -0.0030 -0.1647 -0.3595 0.9431 1e-04
## s.e. 0.0952 0.0937 0.0959 0.0386 1e-04
##
## sigma^2 estimated as 0.1587: log likelihood = -48.11, aic = 108.22
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.003876478 0.3983771 0.3094548 -59.53279 81.36995 0.7024923
## ACF1
## Training set 0.04774178
External regressor does not affect the model, but we can hold it into the model and forecast with it.
# 10-step-ahead forecast; newxreg holds the held-out basket counts.
# NOTE(review): unlike the item-1 forecast, rev() is NOT applied here, so the
# newxreg order is the opposite convention -- confirm which one is correct.
model.forecast <- predict(model, n.ahead= 10, newxreg = pr2$basket_count[c(1:10)] )$pred
last.trend.value <- tail(resultdec$trend[!is.na(resultdec$trend)],1)  # hold last observed trend constant
seasonality <- resultdec$seasonal[4:13]  # seasonal factors phased to the forecast days
# Multiplicative decomposition, so the forecast is re-assembled by multiplication.
forecast_normalized <- model.forecast*last.trend.value*seasonality
forecast_normalized= ts(forecast_normalized, frequency = 7, start=c(15,4))
testdata <- ts(rev(testdata$sold_count), frequency = 7, start=c(15,4))
plot(testdata, ylim = c(30,80))
points(forecast_normalized,type= "l", col = 2)
# In-sample fit on the random scale.
modelfit <- random - model$residuals
plot(random)
points(modelfit, type= "l", col = 2)
Meaningful data for this product is starting from 20th of February 2021. Therefore, we only have 119 data row.
# --- Product 3 (32737302): load and prepare the daily sales data ---
pr3 = read.csv("alldata_item3.csv")
pr3 <- as.data.table(pr3)
pr3 <- pr3[,-c("X","w_day")]  # drop the exported row index and weekday text column
pr3 <- pr3[c(1:119),]  # keep the 119 meaningful rows (from 20 Feb 2021 onward)
pr3 <- mutate(pr3, event_date = dmy(event_date)) # converting event date into datetime object
pr3[, Month:=as.numeric(lubridate::month(event_date,label=F))] #adding month information as a numeric variable
pr3[, Day:=as.numeric(lubridate::wday(event_date,label=F))] #adding day information as a numeric variable
head(pr3)
## price event_date product_content_id sold_count visit_count favored_count
## 1: 59.99 2021-06-18 32737302 76 5222 676
## 2: 59.99 2021-06-17 32737302 102 8368 1541
## 3: 60.24 2021-06-16 32737302 104 5223 485
## 4: 61.83 2021-06-15 32737302 88 4957 411
## 5: 61.15 2021-06-14 32737302 88 5109 380
## 6: 62.11 2021-06-13 32737302 73 4842 426
## basket_count category_sold category_brand_sold category_visits ty_visits
## 1: 386 5860 3459 788525 91784941
## 2: 705 6726 3998 918671 102409626
## 3: 367 6044 4296 830677 105918459
## 4: 353 6007 4470 839150 103541571
## 5: 398 6707 5177 913292 107738598
## 6: 347 5980 3803 1132270 124336805
## category_basket category_favored Month weeknumber Day
## 1: 29529 66037 6 24 6
## 2: 33838 78801 6 24 5
## 3: 29629 68988 6 24 4
## 4: 29324 64562 6 24 3
## 5: 33397 72409 6 24 2
## 6: 36258 98006 6 23 1
To build an ARIMA model, I only need the sold_count output variable.
# Univariate working table for product 3: date plus the response only.
sold <- data.table(event_date =pr3$event_date,
sold_count = pr3$sold_count)
head(sold)
## event_date sold_count
## 1: 2021-06-18 76
## 2: 2021-06-17 102
## 3: 2021-06-16 104
## 4: 2021-06-15 88
## 5: 2021-06-14 88
## 6: 2021-06-13 73
To detect outliers, first I draw boxplot:
boxplot(sold$sold_count)  # visual outlier check
ggplot(sold, aes(x = event_date)) +
geom_line(aes(y = sold_count)) + ggtitle("Product 32737302 Sold Amount") +
xlab("Date") + ylab("Amount Sold")
This product is also a bikini. Therefore, we naturally expect the sold count to show an increasing trend as we get closer to the summer days. However, sales are also high in March, which may be due to a discount.
Now, let us check ACF and PACF plots.
acf(sold$sold_count)   # autocorrelations of the raw series
From ACF plot, there can be a weekly seasonality in the data.
pacf(sold$sold_count)  # partial autocorrelations; note the peak around lag 7
PACF plot had a peak at lag 7. Maybe we can detect seasonality when we do weekly decomposition.
Actually, we only have 119 rows of data. To be able to decompose the data with monthly periods, we need at least 12 months of data. Therefore, we only check weekly seasonality.
# Chronological weekly ts and additive decomposition for product 3.
soldts <- ts(rev(pr3$sold_count), freq = 7, start= c(1,1))
resultweekdec <- decompose(soldts,type= "additive")
plot(resultweekdec)
plot(resultweekdec$random)
title("Random Term of Weekly Decomposed Data")
The random term looks like it has zero mean and constant variance. Therefore, we continue with this random term to build our ARIMA and ARIMAX models.
# Decomposition components; models are fitted on the random remainder.
random = resultweekdec$random
trend = resultweekdec$trend
season = resultweekdec$seasonal
acf(random, na.action = na.pass)   # na.pass tolerates decompose()'s NA padding
We can use 4 autoregressive model but we also need to use auto.arima function.
pacf(random, na.action = na.pass)
ACF and PACF models shows us that this random term need moving average terms.
# auto.arima baseline vs a manual MA(2); candidates compared by AIC.
model <- auto.arima(random)
model
## Series: random
## ARIMA(0,0,1) with zero mean
##
## Coefficients:
## ma1
## 0.3245
## s.e. 0.0879
##
## sigma^2 estimated as 100.8: log likelihood=-420.55
## AIC=845.1 AICc=845.21 BIC=850.55
model <- arima(random, order= c(0,0,2))
summary(model)
##
## Call:
## arima(x = random, order = c(0, 0, 2))
##
## Coefficients:
## ma1 ma2 intercept
## -0.3144 -0.6855 -0.034
## s.e. 0.0630 0.0600 0.043
##
## sigma^2 estimated as 82.03: log likelihood = -411.5, aic = 831.01
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.7763951 9.056836 6.524014 50.04219 164.1316 0.6178633 0.2161719
It is better to go with two moving-average terms, judging by the AIC values.
plot(model$residuals)
title("Residuals")
# In-sample fit on the random scale, re-assembled additively with trend + season.
modelfit <- random - model$residuals
fitted <- modelfit+trend+season
plot(soldts)
points(fitted, type= "l", col = 2)
#### Comments
It has a good fit looking. Now, we can improve the model by adding external regressors.
ggpairs(pr3, columns = c(4,7,8,10,12,13 ))
For this set, Basket Count and Category Sold can be used.
ggpairs(pr3, columns = c(4,5,6,9,11))
For this set, we can use Visit Count.
Overall, we selected our regressors as Basket Count, Category Sold and Visit Count.
# Hold out the 7 most recent days (rows 1:7; data are newest-first) as test set.
traindata <- sold[-c(1:7),]
head(traindata)
## event_date sold_count
## 1: 2021-06-11 37
## 2: 2021-06-10 49
## 3: 2021-06-09 41
## 4: 2021-06-08 46
## 5: 2021-06-07 44
## 6: 2021-06-06 50
testdata <- sold[c(1:7),]
head(testdata)
## event_date sold_count
## 1: 2021-06-18 76
## 2: 2021-06-17 102
## 3: 2021-06-16 104
## 4: 2021-06-15 88
## 5: 2021-06-14 88
## 6: 2021-06-13 73
# Candidate regressors (visit_count, basket_count, category_sold) for the
# training rows. NOTE(review): not reversed while the training ts below is --
# verify the time alignment, as for the other products.
regressors <- pr3[-c(1:7), c(5,7,8)]
head(regressors)
## visit_count basket_count category_sold
## 1: 2750 183 4436
## 2: 3080 217 4467
## 3: 3059 197 5013
## 4: 3266 202 5586
## 5: 3237 164 5390
## 6: 4239 293 5809
# Chronological training ts, additive decomposition, then MA(2) models on the
# random component with and without the three regressors.
traindatats <- ts(rev(traindata$sold_count),frequency = 7, start = c(1,1))
resultdec <- decompose(traindatats,type= "additive")
trend = resultdec$trend
season = resultdec$seasonal
random = resultdec$random
plot(resultdec)
model <- arima(random, order= c(0,0,2))
summary(model)
##
## Call:
## arima(x = random, order = c(0, 0, 2))
##
## Coefficients:
## ma1 ma2 intercept
## -0.3304 -0.6696 0.0156
## s.e. 0.0647 0.0619 0.0459
##
## sigma^2 estimated as 78.95: log likelihood = -384.08, aic = 776.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.532943 8.885519 6.35625 36.1442 152.6354 0.6022914 0.1947125
model <- arima(random, order= c(0,0,2), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(0, 0, 2), xreg = regressors)
##
## Coefficients:
## ma1 ma2 intercept visit_count basket_count category_sold
## -0.3326 -0.6674 -0.2482 -1e-04 0.0030 0e+00
## s.e. 0.0650 0.0621 0.3123 6e-04 0.0106 2e-04
##
## sigma^2 estimated as 78.53: log likelihood = -383.8, aic = 781.59
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.3090294 8.86174 6.343932 44.42244 158.1834 0.6011242 0.1951389
The coefficients of the earlier model have changed, and the category_sold attribute is useless for the model. That is probably because some regressors explain each other. I am going to drop the category_sold column and fit the model again.
# Refit with only visit_count and basket_count as regressors.
regressors <- pr3[-c(1:7), c(5,7)]
head(regressors)
## visit_count basket_count
## 1: 2750 183
## 2: 3080 217
## 3: 3059 197
## 4: 3266 202
## 5: 3237 164
## 6: 4239 293
model <- arima(random, order= c(0,0,2), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(0, 0, 2), xreg = regressors)
##
## Coefficients:
## ma1 ma2 intercept visit_count basket_count
## -0.3320 -0.668 -0.3639 0e+00 0.0018
## s.e. 0.0648 0.062 0.3283 5e-04 0.0098
##
## sigma^2 estimated as 78.59: log likelihood = -383.84, aic = 779.68
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.3696946 8.865366 6.31504 47.73777 152.563 0.5983865 0.194479
Now, we have better moving average and external regressor terms. We can continue with forecasting.
# Held-out regressor values for the forecast horizon (rows 1:10 are the newest).
# NOTE(review): as with item 2, no rev() is applied here while item 1 used one --
# confirm the intended ordering of newxreg.
regmatrix <- pr3[c(1:10), c(5,7)]
model.forecast <- predict(model, n.ahead= 10, newxreg = regmatrix )$pred
last.trend.value <- tail(resultdec$trend[!is.na(resultdec$trend)],10)  # last 10 observed trend values
seasonality <- resultdec$seasonal[96:105]  # seasonal factors phased to the forecast days
forecast_normalized <- model.forecast+last.trend.value+seasonality  # back to the original scale
forecast_normalized= ts(forecast_normalized, frequency = 7, start=c(16,4))
testdata <- ts(rev(testdata$sold_count), frequency = 7, start=c(16,4))
plot(testdata)
points(forecast_normalized,type= "l", col = 2)
# In-sample fit on the random scale.
modelfit <- random - model$residuals
plot(random)
points(modelfit, type= "l", col = 2)
title("Fitted vs Actual")
#### Conclusion
# --- Product 4 (31515569): load the data and split off the last 7 days as test ---
pr4 <- read_excel("item4.xlsx")
ggplot(data = pr4, aes(x = event_date, y = sold_count))+ geom_point()+ geom_line(aes(group=1))
testpr4 <- tail(pr4, 7)      # last 7 observations held out for evaluation
trainpr4 <- head(pr4, -(7))  # everything before the test window
Looking at the daily sales graph, it can be initially stated that the data set does not have any obvious trend. To comment on seasonality, further investigation is needed since the data points are mostly congested around similar values.
The daily sales data for product 31515569 will be converted into weekly and monthly time series data. Then, each time series data will be decomposed to analyze the seasonality and trend properties.
# Weekly (frequency = 7) view of the training data: decompose, inspect the
# seasonal and random components, and KPSS-test the random term for stationarity.
weeklytspr4 <- ts(trainpr4$sold_count, freq = 7, start = c(1,1))
decweeklypr4 <- decompose(weeklytspr4, type = "additive")
plot(decweeklypr4)
tsdisplay(decweeklypr4$seasonal)
tsdisplay(decweeklypr4$random)
kpss.test(decweeklypr4$random)
##
## KPSS Test for Level Stationarity
##
## data: decweeklypr4$random
## KPSS Level = 0.0063763, Truncation lag parameter = 5, p-value = 0.1
The time series data for weekly level is constructed above. Since variance of the initial sales data almost does not change except for some sudden peaks, additive decomposition is applied. Looking at the plot of decomposed data, trend component does not only increase or decrease, hence no trend is observed. When the seasonal component is investigated, it is seen that the seasonal component repeats a certain path. Also, in the ACF plot of seasonal component, there is high autocorrelation between lags that are multiples of 7, hence there is weekly seasonality. Besides, random component turns out to be stationary when KPSS test is applied.
# Same analysis at a "monthly" (frequency = 30) level for comparison.
monthlytspr4 <- ts(trainpr4$sold_count, freq = 30, start = c(1,1))
decmonthlypr4 <- decompose(monthlytspr4, type = "additive")
plot(decmonthlypr4)
tsdisplay(decmonthlypr4$seasonal)
tsdisplay(decmonthlypr4$random)
kpss.test(decmonthlypr4$random)
##
## KPSS Test for Level Stationarity
##
## data: decmonthlypr4$random
## KPSS Level = 0.010258, Truncation lag parameter = 5, p-value = 0.1
The time series data for monthly level is constructed above. Since variance of the initial sales data almost does not change except for some sudden peaks, additive decomposition is applied. Looking at the plot of decomposed data, trend component does not only increase or decrease, hence no trend is observed. When the seasonal component is investigated, it is seen that the seasonal component repeats a certain path. Also, in the ACF plot of seasonal component, there is high autocorrelation between lags that are multiples of 30, hence there is monthly seasonality. Besides, random component turns out to be stationary when KPSS test is applied.
The random component of weekly time series decomposition will be used to construct an ARIMA model as it looks more random than the other candidate. Looking at the ACF and PACF plots of the random term, ARIMA(2,0,2) is chosen as the initial model. Then, selected model will be improved by doing neighborhood search.
# Neighborhood search around the initial ARIMA(2,0,2) guess, comparing by AIC.
randompr4 <- decweeklypr4$random
pr4arima <- arima(randompr4, order=c(2,0,2))
pr4arima
##
## Call:
## arima(x = randompr4, order = c(2, 0, 2))
##
## Coefficients:
## ar1 ar2 ma1 ma2 intercept
## 1.3560 -0.7219 -1.3689 0.3689 0.0759
## s.e. 0.0656 0.0480 0.0988 0.0986 0.3483
##
## sigma^2 estimated as 189608: log likelihood = -2873.9, aic = 5759.81
pr4ar1 <- arima(randompr4, order=c(1,0,2))
pr4ar1
##
## Call:
## arima(x = randompr4, order = c(1, 0, 2))
##
## Coefficients:
## ar1 ma1 ma2 intercept
## -0.1712 0.5596 0.1638 0.9479
## s.e. 0.2451 0.2368 0.0893 40.2192
##
## sigma^2 estimated as 286697: log likelihood = -2949.96, aic = 5909.91
pr4ar2 <- arima(randompr4, order=c(3,0,2))
pr4ar2 #lowest AIC
##
## Call:
## arima(x = randompr4, order = c(3, 0, 2))
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 intercept
## 1.8703 -1.3177 0.3135 -1.9487 0.9489 0.0762
## s.e. 0.0542 0.0895 0.0530 0.0213 0.0212 0.0874
##
## sigma^2 estimated as 174484: log likelihood = -2859.42, aic = 5732.84
# Remaining neighbors of the search; ARIMA(3,0,2) stays the AIC winner.
pr4ar3 <- arima(randompr4, order=c(2,0,1))
pr4ar3
##
## Call:
## arima(x = randompr4, order = c(2, 0, 1))
##
## Coefficients:
## ar1 ar2 ma1 intercept
## 1.1117 -0.5481 -1.0000 0.0801
## s.e. 0.0426 0.0426 0.0066 0.4687
##
## sigma^2 estimated as 197248: log likelihood = -2881.13, aic = 5772.26
pr4ar4 <- arima(randompr4, order=c(2,0,3))
pr4ar4
##
## Call:
## arima(x = randompr4, order = c(2, 0, 3))
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 intercept
## 1.5142 -0.7286 -1.6301 0.3432 0.2870 0.0762
## s.e. 0.0398 0.0394 0.0545 0.1010 0.0513 0.0850
##
## sigma^2 estimated as 177443: log likelihood = -2863.02, aic = 5740.03
pr4model <- pr4ar2  # final choice: ARIMA(3,0,2), lowest AIC in the search
After neighborhood search, the previously selected model is improved and the final model turns out to be ARIMA(3,0,2).
The plot of fitted model vs real values is shown below.
# Re-assemble the in-sample fit onto the original scale and overlay it.
model_fitted_pr4 <- randompr4 - residuals(pr4model)
model_fitted_transformed_pr4 <- model_fitted_pr4+decweeklypr4$trend+decweeklypr4$seasonal
plot(weeklytspr4, xlab = "Weeks", ylab = "Sold Count",main="Actual (Black) vs. Predicted (Blue)")+points(model_fitted_transformed_pr4, type = "l", col = 5, lty = 1)
## integer(0)
To have an idea about the performance of the model, residuals of the model will be investigated.
checkresiduals(pr4model,lag=70)  # Ljung-Box on residuals up to 10 weekly cycles
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,2) with non-zero mean
## Q* = 80.241, df = 64, p-value = 0.08268
##
## Model df: 6. Total lags used: 70
Looking at the residuals plot, it can be stated that zero mean and constant variance assumptions do not exactly hold. Besides, the ACF plot shows that residuals are not exactly random. As a result, the model is not adequate and can be improved by adding external regressors.
In this part, relation of sold count data with other regressors will be tested to see whether the model can be improved by using external regressors. Because of the dirty nature of the data, only “basket count”, “category sold”, “category visits”, and “category favored” columns will be tested using pairplots.
# Candidate regressors for the ARIMAX model.
# NOTE(review): the prose mentions "category sold" as a tested column but it is
# not included here -- only basket_count, category_visits and category_favored.
pr4data <- data.frame(date = trainpr4$event_date, sold_count = trainpr4$sold_count, basket_count = trainpr4$basket_count, category_visits = trainpr4$category_visits, category_favored = trainpr4$category_favored)
pairs(pr4data)
According to the pairplot, “basket count” and “category favored” columns are chosen to be added to the model as external regressors.
As chosen in the previous section, “basket count” and “category favored” data will be added to the model as external regressors.
# ARIMAX with two regressors: basket_count and category_favored.
regressorspr4 <- data.frame(trainpr4$basket_count, trainpr4$category_favored)
arimaxpr4 <- arima(randompr4, order = c(3,0,2), xreg = regressorspr4)
arimaxpr4
##
## Call:
## arima(x = randompr4, order = c(3, 0, 2), xreg = regressorspr4)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 intercept
## 1.8684 -1.3127 0.3071 -1.9689 0.9691 -1.1026
## s.e. 0.0523 0.0874 0.0517 0.0173 0.0172 0.2891
## trainpr4.basket_count trainpr4.category_favored
## 4e-04 0
## s.e. 1e-04 0
##
## sigma^2 estimated as 170319: log likelihood = -2855.28, aic = 5728.57
Looking at the significance values of the regressors, it can be said that the second regressor “category favored” has lost its significance. Therefore, the model can be built using the first regressor only.
# Final ARIMAX: keep only basket_count as external regressor.
arimaxpr4 <- arima(randompr4, order = c(3,0,2), xreg = pr4data$basket_count)
arimaxpr4
##
## Call:
## arima(x = randompr4, order = c(3, 0, 2), xreg = pr4data$basket_count)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 intercept pr4data$basket_count
## 1.8770 -1.3247 0.3143 -1.9702 0.9704 -1.3396 3e-04
## s.e. 0.0515 0.0865 0.0510 0.0132 0.0130 NaN NaN
##
## sigma^2 estimated as 171002: log likelihood = -2856.16, aic = 5728.31
After the addition of regressors, as a first impression, it can be observed that the AIC value falls, hence the model improves. To make a further investigation, fitted model vs real values graph can be plotted and residuals can be analyzed.
# Reconstruct in-sample fitted values on the original scale: the ARIMAX fit
# explains only the random component, so add back the trend and seasonal
# components of the weekly decomposition.
xregmodel_fitted_pr4 <- randompr4 - residuals(arimaxpr4)
xregmodel_fitted_transformed_pr4 <- xregmodel_fitted_pr4+decweeklypr4$trend+decweeklypr4$seasonal
# Fix: base-graphics plot() and points() return NULL; chaining them with `+`
# only printed a spurious "integer(0)". Call them as separate statements.
plot(weeklytspr4, xlab = "Weeks", ylab = "Sold Count",main="Actual (Black) vs. Predicted (Blue)")
points(xregmodel_fitted_transformed_pr4, type = "l", col = 5, lty = 1)
## integer(0)
checkresiduals(arimaxpr4,lag=70)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,2) with non-zero mean
## Q* = 79.936, df = 63, p-value = 0.07357
##
## Model df: 7. Total lags used: 70
Compared to the ARIMA(3,0,2) model, which does not contain any external regressor, the model has just slightly improved with this ARIMAX model.
Predictions for the next 7 days are shown below.
# 7-day-ahead forecast of the random component.
# NOTE(review): newxreg reuses rows 383:389 of the TRAINING basket_count as
# "future" regressor values — confirm these indices really correspond to the
# forecast horizon rather than to in-sample days.
model_forecast_pr4 = predict(arimaxpr4, n.ahead=7, newxreg = pr4data$basket_count[383:389])$pred
# Bug fix: the NA mask must come from the same (weekly) decomposition whose
# trend is being indexed; the original filtered decweeklypr4$trend with
# decmonthlypr4's NA pattern, selecting the wrong "last" trend value.
last_trend_pr4 = tail(decweeklypr4$trend[!is.na(decweeklypr4$trend)], 1)
seasonality_pr4 = decweeklypr4$seasonal[5:11]
# Re-add trend and seasonality to return to the original scale.
model_forecast_pr4 = model_forecast_pr4+last_trend_pr4+seasonality_pr4
forecastpr4 <- data.frame(date = testpr4$event_date, actual = testpr4$sold_count, predicted = model_forecast_pr4)
forecastpr4
## date actual predicted
## 1 2021-06-18 191 351.0687
## 2 2021-06-19 315 290.4191
## 3 2021-06-20 387 354.1438
## 4 2021-06-21 390 581.6685
## 5 2021-06-22 224 716.1490
## 6 2021-06-23 269 801.7160
## 7 2021-06-24 265 770.5171
To observe the performance of the model, actual test data vs predicted values graph will be plotted.
# Overlay actual vs predicted sold counts over the 7-day test window.
ggplot() +
geom_line(data = forecastpr4, aes(x = date, y = predicted,color = "predicted")) +
geom_line(data = forecastpr4, aes(x = date, y = actual,color = "actual")) +
xlab('Date') +
ylab('Sold Count')
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
As seen above, predictions are not so far from the actual data. However, it can be said that the last form of the model is not the best that can be built. To improve the model, outlier data points may be ignored and the model can be constructed again. Alternatively, LM models can be tried.
# Product 5: load the data, visualize daily sales, and hold out the
# last 7 days as the test set.
pr5 <- read_excel("item5.xlsx")
ggplot(data = pr5, aes(x = event_date, y = sold_count))+ geom_point()+ geom_line(aes(group=1))
testpr5 <- tail(pr5, 7)
trainpr5 <- head(pr5, -(7))
Looking at the daily sales graph, it can be initially stated that the data set does not have any obvious trend. To comment on seasonality, further investigation is needed.
The daily sales data for product 6676673 will be converted into weekly and monthly time series data. Then, each time series data will be decomposed to analyze the seasonality and trend properties.
# Weekly (freq = 7) additive decomposition of the training series, with
# ACF/PACF displays of the components and a KPSS test on the remainder.
weeklytspr5 <- ts(trainpr5$sold_count, freq = 7, start = c(1,1))
decweeklypr5 <- decompose(weeklytspr5, type = "additive")
plot(decweeklypr5)
tsdisplay(decweeklypr5$seasonal)
tsdisplay(decweeklypr5$random)
kpss.test(decweeklypr5$random)
##
## KPSS Test for Level Stationarity
##
## data: decweeklypr5$random
## KPSS Level = 0.0070808, Truncation lag parameter = 5, p-value = 0.1
The time series data for weekly level is constructed above. Since variance does not change significantly over time, additive decomposition is applied. Looking at the plot of decomposed data, trend component does not only increase or decrease, hence no trend is observed. When the seasonal component is investigated, it is seen that the seasonal component repeats a certain path. Also, in the ACF plot of seasonal component, there is high autocorrelation between lags that are multiples of 7, hence there is weekly seasonality. Besides, random component turns out to be stationary when KPSS test is applied.
# Monthly (freq = 30) additive decomposition of the same series,
# for comparison against the weekly decomposition.
monthlytspr5 <- ts(trainpr5$sold_count, freq = 30, start = c(1,1))
decmonthlypr5 <- decompose(monthlytspr5, type = "additive")
plot(decmonthlypr5)
tsdisplay(decmonthlypr5$seasonal)
tsdisplay(decmonthlypr5$random)
kpss.test(decmonthlypr5$random)
##
## KPSS Test for Level Stationarity
##
## data: decmonthlypr5$random
## KPSS Level = 0.011793, Truncation lag parameter = 5, p-value = 0.1
The time series data at the monthly level is constructed above. Since the variance does not change significantly over time, additive decomposition is applied. Looking at the plot of the decomposed data, the trend component does not consistently increase or decrease, hence no trend is observed. When the seasonal component is investigated, it is seen that the seasonal component repeats a certain path. Also, in the ACF plot of the seasonal component, there is high autocorrelation between lags that are multiples of 30, hence there is monthly seasonality. Besides, the random component turns out to be stationary when the KPSS test is applied.
While investigating the weekly time series, it is realized that there is high autocorrelation between random terms in every three days. Hence, it is decided to build a three-daily time series data and use its random term while constructing an ARIMA model.
# Three-daily (freq = 3) additive decomposition, motivated by the
# autocorrelation at 3-day lags seen in the weekly remainder.
threetspr5 <- ts(trainpr5$sold_count, freq = 3, start = c(1,1))
decthreepr5 <- decompose(threetspr5, type = "additive")
plot(decthreepr5)
tsdisplay(decthreepr5$seasonal)
tsdisplay(decthreepr5$random)
kpss.test(decthreepr5$random)
##
## KPSS Test for Level Stationarity
##
## data: decthreepr5$random
## KPSS Level = 0.0099832, Truncation lag parameter = 5, p-value = 0.1
After the investigation of time series decompositions at different levels, random term of three-daily time series decomposition is chosen to build the model as it is more random than the other candidates. Looking at the ACF and PACF plots of the random term, ARIMA(0,0,2) is chosen as the initial model. Then, selected model will be improved by doing neighborhood search.
# Model the remainder of the three-daily decomposition; ARIMA(0,0,2)
# is the starting point suggested by the ACF/PACF plots.
randompr5 <- decthreepr5$random
pr5arima <- arima(randompr5, order = c(0,0,2))
pr5arima
##
## Call:
## arima(x = randompr5, order = c(0, 0, 2))
##
## Coefficients:
## ma1 ma2 intercept
## -1.2393 0.2394 -0.0047
## s.e. 0.0567 0.0563 0.0184
##
## sigma^2 estimated as 2787: log likelihood = -2087.4, aic = 4182.8
# Neighborhood search: MA(1) candidate.
pr5ar1 <- arima(randompr5, order=c(0,0,1))
pr5ar1
##
## Call:
## arima(x = randompr5, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## -1.0000 -0.0036
## s.e. 0.0065 0.0247
##
## sigma^2 estimated as 2936: log likelihood = -2097.15, aic = 4200.29
# Neighborhood search: MA(3) candidate.
pr5ar2 <- arima(randompr5, order=c(0,0,3))
pr5ar2
##
## Call:
## arima(x = randompr5, order = c(0, 0, 3))
##
## Coefficients:
## ma1 ma2 ma3 intercept
## -1.2725 0.1750 0.0975 -0.0049
## s.e. 0.0569 0.0717 0.0657 0.0153
##
## sigma^2 estimated as 2768: log likelihood = -2086.3, aic = 4182.6
# Neighborhood search: ARMA(1,2) candidate — best AIC among the candidates.
pr5ar3 <- arima(randompr5, order=c(1,0,2))
pr5ar3 #lowest AIC
##
## Call:
## arima(x = randompr5, order = c(1, 0, 2))
##
## Coefficients:
## ar1 ma1 ma2 intercept
## 0.6638 -1.9599 0.9601 -0.0038
## s.e. 0.0482 0.0234 0.0233 0.0040
##
## sigma^2 estimated as 2532: log likelihood = -2070.49, aic = 4150.99
pr5model <- pr5ar3
After neighborhood search, the previously selected model is improved and the final model turns out to be ARIMA(1,0,2).
The plot of fitted model vs real values is shown below.
# Reconstruct in-sample fitted values on the original scale by adding the
# trend and seasonal components of the three-daily decomposition back.
model_fitted_pr5 <- randompr5 - residuals(pr5model)
model_fitted_transformed_pr5 <- model_fitted_pr5+decthreepr5$trend+decthreepr5$seasonal
# Fix: base-graphics plot() and points() return NULL; chaining with `+` only
# printed a spurious "integer(0)". Call them as separate statements.
plot(threetspr5, xlab = "3-Days", ylab = "Sold Count",main="Actual (Black) vs. Predicted (Blue)")
points(model_fitted_transformed_pr5, type = "l", col = 5, lty = 1)
## integer(0)
To have an idea about the performance of the model, residuals of the model will be investigated.
checkresiduals(pr5model,lag=70)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with non-zero mean
## Q* = 53.643, df = 66, p-value = 0.8626
##
## Model df: 4. Total lags used: 70
Looking at the residuals plot, it can be stated that zero mean assumption does not exactly hold. To improve the current model, external regressors can be added and an ARIMAX model can be built.
In this part, relation of sold count data with other regressors will be tested to see whether the model can be improved by using external regressors. Because of the dirty nature of the data, only “basket count”, “category sold”, “category visits”, and “category favored” columns will be tested using pairplots.
# Candidate external regressors for product 5; inspect pairwise scatterplots.
pr5data <- data.frame(date = trainpr5$event_date, sold_count = trainpr5$sold_count, basket_count = trainpr5$basket_count, category_visits = trainpr5$category_visits, category_favored = trainpr5$category_favored)
pairs(pr5data)
According to the pairplot, “basket count” and “category favored” columns are chosen to be added to the model as external regressors.
As chosen in the previous section, “basket count” and “category favored” data will be added to the model as external regressors.
# ARIMAX: refit ARIMA(1,0,2) on the random component with basket_count
# and category_favored as external regressors.
regressorspr5 <- data.frame(trainpr5$basket_count, trainpr5$category_favored)
arimaxpr5 <- arima(randompr5, order = c(1,0,2), xreg = regressorspr5)
arimaxpr5
##
## Call:
## arima(x = randompr5, order = c(1, 0, 2), xreg = regressorspr5)
##
## Coefficients:
## ar1 ma1 ma2 intercept trainpr5.basket_count
## 0.6552 -1.9687 0.9704 -0.2914 2e-04
## s.e. 0.0461 0.0227 0.0232 NaN 0e+00
## trainpr5.category_favored
## 0
## s.e. 0
##
## sigma^2 estimated as 2490: log likelihood = -2066.54, aic = 4147.08
Looking at the significance values of the regressors, it can be said that the second regressor “category favored” has lost its significance. Therefore, the model can be built using the first regressor only.
# Refit keeping only basket_count; the category_favored coefficient was ~0.
arimaxpr5 <- arima(randompr5, order = c(1,0,2), xreg = pr5data$basket_count)
arimaxpr5
##
## Call:
## arima(x = randompr5, order = c(1, 0, 2), xreg = pr5data$basket_count)
##
## Coefficients:
## ar1 ma1 ma2 intercept pr5data$basket_count
## 0.6839 -1.9895 0.9999 -1.3034 8e-04
## s.e. 0.0380 NaN NaN 0.1939 1e-04
##
## sigma^2 estimated as 2480: log likelihood = -2067.17, aic = 4146.33
After the addition of regressors, as a first impression, it can be observed that the AIC value falls, hence the model improves. To make a further investigation, fitted model vs real values graph can be plotted and residuals can be analyzed.
# Reconstruct in-sample ARIMAX fitted values on the original scale.
xregmodel_fitted_pr5 <- randompr5 - residuals(arimaxpr5)
xregmodel_fitted_transformed_pr5 <- xregmodel_fitted_pr5+decthreepr5$trend+decthreepr5$seasonal
# Fix: base-graphics plot() and points() return NULL; chaining with `+` only
# printed a spurious "integer(0)". Call them as separate statements.
plot(threetspr5, xlab = "3-Days", ylab = "Sold Count",main="Actual (Black) vs. Predicted (Blue)")
points(xregmodel_fitted_transformed_pr5, type = "l", col = 5, lty = 1)
## integer(0)
checkresiduals(arimaxpr5,lag=70)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with non-zero mean
## Q* = 54.528, df = 65, p-value = 0.8195
##
## Model df: 5. Total lags used: 70
Compared to the ARIMA(1,0,2) model, which does not contain any external regressor, the model has just slightly improved in this ARIMAX model.
Predictions for the next 7 days are shown below.
# 7-day-ahead forecast of the random component.
# NOTE(review): newxreg reuses rows 383:389 of the TRAINING basket_count as
# "future" regressor values — confirm these indices match the horizon.
model_forecast_pr5 = predict(arimaxpr5, n.ahead=7, newxreg = pr5data$basket_count[383:389])$pred
# Bug fix: the NA mask must come from the same (three-daily) decomposition
# whose trend is being indexed; the original masked decthreepr5$trend with
# decmonthlypr5's NA pattern, selecting the wrong "last" trend value.
last_trend_pr5 = tail(decthreepr5$trend[!is.na(decthreepr5$trend)], 1)
seasonality_pr5 = decthreepr5$seasonal[3:9]
# Re-add trend and seasonality to return to the original scale.
model_forecast_pr5 = model_forecast_pr5+last_trend_pr5+seasonality_pr5
forecastpr5 <- data.frame(date = testpr5$event_date, actual = testpr5$sold_count, predicted = model_forecast_pr5)
forecastpr5
## date actual predicted
## 1 2021-06-18 620 495.8284
## 2 2021-06-19 614 506.1533
## 3 2021-06-20 658 498.3587
## 4 2021-06-21 455 497.1839
## 5 2021-06-22 597 507.2485
## 6 2021-06-23 425 499.2307
## 7 2021-06-24 274 497.3649
To observe the performance of the model, actual test data vs predicted values graph will be plotted.
# Overlay actual vs predicted sold counts over the 7-day test window.
ggplot() +
geom_line(data = forecastpr5, aes(x = date, y = predicted,color = "predicted")) +
geom_line(data = forecastpr5, aes(x = date, y = actual,color = "actual")) +
xlab('Date') +
ylab('Sold Count')
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
As seen above, predictions are not so far from the actual data. However, it can be said that the last form of the model is not the best that can be built. To improve the model, outlier data points may be ignored and the model can be constructed again. Alternatively, LM models can be tried.
# Product 6: load the data, visualize daily sales, and hold out the
# last 7 days as the test set.
pr6 <- read_excel("item6.xlsx")
ggplot(data = pr6, aes(x = event_date, y = sold_count))+ geom_point()+ geom_line(aes(group=1))
testpr6 <- tail(pr6, 7)
trainpr6 <- head(pr6, -(7))
Looking at the daily sales graph, it can be initially stated that the data set does not have any obvious trend. To comment on seasonality, further investigation is needed since the data points are mostly congested around similar values.
The daily sales data for product 7061886 will be converted into weekly and monthly time series data. Then, each time series data will be decomposed to analyze the seasonality and trend properties.
# Weekly (freq = 7) additive decomposition of the training series, with
# ACF/PACF displays and a KPSS stationarity test on the remainder.
weeklytspr6 <- ts(trainpr6$sold_count, freq = 7, start = c(1,1))
decweeklypr6 <- decompose(weeklytspr6, type = "additive")
plot(decweeklypr6)
tsdisplay(decweeklypr6$seasonal)
tsdisplay(decweeklypr6$random)
kpss.test(decweeklypr6$random)
##
## KPSS Test for Level Stationarity
##
## data: decweeklypr6$random
## KPSS Level = 0.0063015, Truncation lag parameter = 5, p-value = 0.1
The time series data for weekly level is constructed above. Since variance does not change significantly over time, additive decomposition is applied. Looking at the plot of decomposed data, trend component does not only increase or decrease, hence no trend is observed. When the seasonal component is investigated, it is seen that the seasonal component repeats a certain path. Also, in the ACF plot of seasonal component, there is high autocorrelation between lags that are multiples of 7, hence there is weekly seasonality. Besides, random component turns out to be stationary when KPSS test is applied.
# Monthly (freq = 30) additive decomposition, for comparison.
monthlytspr6 <- ts(trainpr6$sold_count, freq = 30, start = c(1,1))
decmonthlypr6 <- decompose(monthlytspr6, type = "additive")
plot(decmonthlypr6)
tsdisplay(decmonthlypr6$seasonal)
tsdisplay(decmonthlypr6$random)
kpss.test(decmonthlypr6$random)
##
## KPSS Test for Level Stationarity
##
## data: decmonthlypr6$random
## KPSS Level = 0.0085952, Truncation lag parameter = 5, p-value = 0.1
The time series data at the monthly level is constructed above. Since the variance does not change significantly over time, additive decomposition is applied. Looking at the plot of the decomposed data, the trend component does not consistently increase or decrease, hence no trend is observed. When the seasonal component is investigated, it is seen that the seasonal component repeats a certain path. Also, in the ACF plot of the seasonal component, there is high autocorrelation between lags that are multiples of 30, hence there is monthly seasonality. Besides, the random component turns out to be stationary when the KPSS test is applied.
The random component of weekly time series decomposition will be used to construct an ARIMA model as it looks more random than the other candidate. Looking at the ACF and PACF plots of the random term, ARIMA(1,0,2) is chosen as the initial model. Then, selected model will be improved by doing neighborhood search.
# Model the remainder of the weekly decomposition; ARIMA(1,0,2) is the
# starting point suggested by the ACF/PACF plots.
randompr6 <- decweeklypr6$random
pr6arima <- arima(randompr6, order=c(1,0,2))
pr6arima
##
## Call:
## arima(x = randompr6, order = c(1, 0, 2))
##
## Coefficients:
## ar1 ma1 ma2 intercept
## -0.3053 0.5534 0.2596 0.0379
## s.e. 0.1639 0.1524 0.0582 1.3469
##
## sigma^2 estimated as 360.8: log likelihood = -1671.14, aic = 3352.27
# Neighborhood search: MA(2) candidate.
pr6ar1 <- arima(randompr6, order=c(0,0,2))
pr6ar1
##
## Call:
## arima(x = randompr6, order = c(0, 0, 2))
##
## Coefficients:
## ma1 ma2 intercept
## 0.2776 0.2000 0.0443
## s.e. 0.0524 0.0664 1.4394
##
## sigma^2 estimated as 364.3: log likelihood = -1672.99, aic = 3353.97
# Neighborhood search: ARMA(2,2) candidate.
pr6ar2 <- arima(randompr6, order=c(2,0,2))
pr6ar2
##
## Call:
## arima(x = randompr6, order = c(2, 0, 2))
##
## Coefficients:
## ar1 ar2 ma1 ma2 intercept
## 0.6811 -0.7508 -0.4353 0.6865 0.0325
## s.e. 0.0872 0.0824 0.0859 0.1079 1.1008
##
## sigma^2 estimated as 339.2: log likelihood = -1659.42, aic = 3330.84
# Neighborhood search: ARMA(1,1) candidate.
pr6ar3 <- arima(randompr6, order=c(1,0,1))
pr6ar3
##
## Call:
## arima(x = randompr6, order = c(1, 0, 1))
##
## Coefficients:
## ar1 ma1 intercept
## 0.1358 0.0736 0.0482
## s.e. 0.1307 0.1241 1.2212
##
## sigma^2 estimated as 370.6: log likelihood = -1676.2, aic = 3360.39
# Neighborhood search: ARMA(1,3) candidate — best AIC among the candidates.
pr6ar4 <- arima(randompr6, order=c(1,0,3))
pr6ar4 #lowest AIC
##
## Call:
## arima(x = randompr6, order = c(1, 0, 3))
##
## Coefficients:
## ar1 ma1 ma2 ma3 intercept
## 0.3564 -0.4482 -0.1511 -0.4007 0.0032
## s.e. 0.0728 0.0680 0.0634 0.0528 0.0227
##
## sigma^2 estimated as 268.2: log likelihood = -1616.7, aic = 3245.4
pr6model <- pr6ar4
After neighborhood search, the previously selected model is improved and the final model turns out to be ARIMA(1,0,3).
The plot of fitted model vs real values is shown below.
# Reconstruct in-sample fitted values on the original scale by adding the
# weekly trend and seasonal components back.
model_fitted_pr6 <- randompr6 - residuals(pr6model)
model_fitted_transformed_pr6 <- model_fitted_pr6+decweeklypr6$trend+decweeklypr6$seasonal
# Fix: base-graphics plot() and points() return NULL; chaining with `+` only
# printed a spurious "integer(0)". Call them as separate statements.
plot(weeklytspr6, xlab = "Weeks", ylab = "Sold Count",main="Actual (Black) vs. Predicted (Blue)")
points(model_fitted_transformed_pr6, type = "l", col = 5, lty = 1)
## integer(0)
To have an idea about the performance of the model, residuals of the model will be investigated.
checkresiduals(pr6model,lag=70)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with non-zero mean
## Q* = 128.89, df = 65, p-value = 4.124e-06
##
## Model df: 5. Total lags used: 70
Looking at the residuals plot, it can be stated that constant variance assumption does not exactly hold. Besides, the ACF plot shows that residuals are a little bit correlated. To improve the current model, external regressors can be added and an ARIMAX model can be built.
In this part, relation of sold count data with other regressors will be tested to see whether the model can be improved by using external regressors. Because of the dirty nature of the data, only “basket count”, “category sold”, “category visits”, and “category favored” columns will be tested using pairplots.
# Candidate external regressors for product 6; inspect pairwise scatterplots.
pr6data <- data.frame(date = trainpr6$event_date, sold_count = trainpr6$sold_count, basket_count = trainpr6$basket_count, category_visits = trainpr6$category_visits, category_favored = trainpr6$category_favored)
pairs(pr6data)
According to the pairplot, “basket count” and “category favored” columns are chosen to be added to the model as external regressors.
As chosen in the previous section, “basket count” and “category favored” data will be added to the model as external regressors.
# ARIMAX: refit ARIMA(1,0,3) on the random component with basket_count
# and category_favored as external regressors.
regressorspr6 <- data.frame(trainpr6$basket_count, trainpr6$category_favored)
arimaxpr6 <- arima(randompr6, order = c(1,0,3), xreg = regressorspr6)
arimaxpr6
##
## Call:
## arima(x = randompr6, order = c(1, 0, 3), xreg = regressorspr6)
##
## Coefficients:
## ar1 ma1 ma2 ma3 intercept trainpr6.basket_count
## 0.3549 -0.4500 -0.1501 -0.3999 -0.3934 0.0016
## s.e. 0.0731 0.0682 0.0635 0.0529 0.0582 0.0009
## trainpr6.category_favored
## 0
## s.e. NaN
##
## sigma^2 estimated as 266: log likelihood = -1615.1, aic = 3246.2
Looking at the significance values of the regressors, it can be said that the second regressor “category favored” has lost its significance. Therefore, the model can be built using the first regressor only.
# Refit keeping only basket_count; the category_favored coefficient was ~0.
arimaxpr6 <- arima(randompr6, order = c(1,0,3), xreg = pr6data$basket_count)
arimaxpr6
##
## Call:
## arima(x = randompr6, order = c(1, 0, 3), xreg = pr6data$basket_count)
##
## Coefficients:
## ar1 ma1 ma2 ma3 intercept pr6data$basket_count
## 0.3555 -0.4493 -0.1505 -0.4002 -0.2375 0.0016
## s.e. 0.0733 0.0683 0.0635 0.0529 0.1649 0.0011
##
## sigma^2 estimated as 266.7: log likelihood = -1615.59, aic = 3245.17
After the addition of regressors, as a first impression, it can be observed that the AIC value slightly falls, hence the model just improves. To make a further investigation, fitted model vs real values graph can be plotted and residuals can be analyzed.
# Reconstruct in-sample ARIMAX fitted values on the original scale.
xregmodel_fitted_pr6 <- randompr6 - residuals(arimaxpr6)
xregmodel_fitted_transformed_pr6 <- xregmodel_fitted_pr6+decweeklypr6$trend+decweeklypr6$seasonal
# Fix: base-graphics plot() and points() return NULL; chaining with `+` only
# printed a spurious "integer(0)". Call them as separate statements.
plot(weeklytspr6, xlab = "Weeks", ylab = "Sold Count",main="Actual (Black) vs. Predicted (Blue)")
points(xregmodel_fitted_transformed_pr6, type = "l", col = 5, lty = 1)
## integer(0)
checkresiduals(arimaxpr6,lag=70)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with non-zero mean
## Q* = 128.1, df = 64, p-value = 3.521e-06
##
## Model df: 6. Total lags used: 70
Compared to the ARIMA(1,0,3) model, which does not contain any external regressor, autocorrelation of the residuals does not change much in this ARIMAX model.
Predictions for the next 7 days are shown below.
# 7-day-ahead forecast of the random component.
# NOTE(review): newxreg reuses rows 383:389 of the TRAINING basket_count as
# "future" regressor values — confirm these indices match the horizon.
model_forecast_pr6 = predict(arimaxpr6, n.ahead=7, newxreg = pr6data$basket_count[383:389])$pred
# Last non-NA weekly trend value and one weekly seasonal cycle, used to
# map the random-component forecast back to the original scale.
last_trend_pr6 = tail(decweeklypr6$trend[!is.na(decweeklypr6$trend)], 1)
seasonality_pr6 = decweeklypr6$seasonal[5:11]
model_forecast_pr6 = model_forecast_pr6+last_trend_pr6+seasonality_pr6
forecastpr6 <- data.frame(date = testpr6$event_date, actual = testpr6$sold_count, predicted = model_forecast_pr6)
forecastpr6
## date actual predicted
## 1 2021-06-18 16 9.324846
## 2 2021-06-19 7 7.067309
## 3 2021-06-20 14 10.121678
## 4 2021-06-21 10 13.154488
## 5 2021-06-22 6 20.294680
## 6 2021-06-23 12 22.690423
## 7 2021-06-24 7 19.537203
To observe the performance of the model, actual test data vs predicted values graph will be plotted.
# Overlay actual vs predicted sold counts over the 7-day test window.
ggplot() +
geom_line(data = forecastpr6, aes(x = date, y = predicted,color = "predicted")) +
geom_line(data = forecastpr6, aes(x = date, y = actual,color = "actual")) +
xlab('Date') +
ylab('Sold Count')
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
As seen above, predictions are a bit off from the actual data. It can be said that the last form of the model is not the best that can be built. To improve the model, outlier data points may be ignored and the model can be constructed again. Alternatively, LM models can be tried.
# Product 7 (id 85004): load, drop helper columns, parse dates, and
# derive numeric month/weekday features.
pr7 = read.csv("alldata_item7.csv")
pr7 <- as.data.table(pr7)
pr7 <- pr7[,-c("X","w_day")]
pr7 <- mutate(pr7, event_date = ymd(event_date)) # converting event date into datetime object
pr7[, Month:=as.numeric(lubridate::month(event_date,label=F))] #adding month information as a numeric variable
pr7[, Day:=as.numeric(lubridate::wday(event_date,label=F))] #adding day information as a numeric variable
head(pr7)
## price event_date product_content_id sold_count visit_count favored_count
## 1: 87.14 2021-06-18 85004 57 2144 299
## 2: 86.78 2021-06-17 85004 79 2729 538
## 3: 87.42 2021-06-16 85004 81 2850 408
## 4: 88.08 2021-06-15 85004 78 2989 423
## 5: 89.20 2021-06-14 85004 47 2796 458
## 6: 86.40 2021-06-13 85004 95 4081 685
## basket_count category_sold category_brand_sold category_visits ty_visits
## 1: 271 2772 286 114848 91784941
## 2: 332 2927 350 117287 102409626
## 3: 336 3154 360 134400 105918459
## 4: 344 3504 389 137894 103541571
## 5: 298 3159 295 139775 107738598
## 6: 442 3895 417 166996 124336805
## category_basket category_favored Month weeknumber Day
## 1: 14440 15512 6 NA 6
## 2: 14701 15084 6 24 5
## 3: 16617 18419 6 24 4
## 4: 17365 19431 6 24 3
## 5: 17284 19610 6 24 2
## 6: 20152 23315 6 24 1
# Keep only the date/sold_count pair for the univariate analysis.
# Note: rows are in reverse chronological order (newest first).
sold <- data.table(event_date =pr7$event_date,
sold_count = pr7$sold_count)
head(sold)
## event_date sold_count
## 1: 2021-06-18 57
## 2: 2021-06-17 79
## 3: 2021-06-16 81
## 4: 2021-06-15 78
## 5: 2021-06-14 47
## 6: 2021-06-13 95
boxplot(sold$sold_count)
- For an initial observation, I plotted the sold_count data over time. At first sight, a seasonality component seems to exist, but further examination was needed. So, I used ACF and PACF plots. From the ACF, we see that there is a seasonality pattern in the data.
# Daily sales over time, plus ACF/PACF of the raw series to check
# for a seasonal pattern.
ggplot(sold, aes(x = event_date)) +
geom_line(aes(y = sold_count)) + ggtitle("Product 85004 Sold Amount") +
xlab("Date") + ylab("Amount Sold")
acf(sold$sold_count)
pacf(sold$sold_count)
# Multiplicative weekly and monthly decompositions of the (chronologically
# reversed, i.e. oldest-first) series; compare the random terms, then keep
# the weekly components for modeling.
soldts <- ts(rev(pr7$sold_count), freq = 7, start= c(1,1))
resultweekdec <- decompose(soldts,type= "multiplicative")
plot(resultweekdec)
soldtsmonth <- ts(rev(pr7$sold_count), freq = 30, start= c(1,1))
resultmonthdec <- decompose(soldtsmonth,type= "multiplicative")
plot(resultmonthdec)
plot(resultweekdec$random)
title("Random Term of Weekly Decomposed Data")
plot(resultmonthdec$random)
title("Random Term of Monthly Decomposed Data")
random = resultweekdec$random
trend = resultweekdec$trend
season = resultweekdec$seasonal
# na.action = na.pass: decompose() leaves NA at the series edges.
acf(random, na.action = na.pass)
pacf(random, na.action = na.pass)
# AIC-based neighborhood search starts here with AR(1).
model <- arima(random, order= c(1,0,0))
AIC(model)
## [1] 218.4069
# Continue the AIC neighborhood search over (p,0,q) orders; each printed
# value is the AIC of the model fitted on the preceding line.
model <- arima(random, order= c(2,0,0))
AIC(model)
## [1] 206.2024
model <- arima(random, order= c(3,0,0))
AIC(model)
## [1] 168.583
model <- arima(random, order= c(4,0,0))
AIC(model)
## [1] 166.5287
model <- arima(random, order= c(5,0,0))
AIC(model)
## [1] 158.1148
model <- arima(random, order= c(6,0,0))
AIC(model)
## [1] 154.5429
model <- arima(random, order= c(1,0,1))
AIC(model)
## [1] 216.7983
model <- arima(random, order= c(2,0,1))
AIC(model)
## [1] 160.3652
model <- arima(random, order= c(3,0,1))
AIC(model)
## [1] 152.2754
model <- arima(random, order= c(4,0,1))
AIC(model)
## [1] 153.7687
model <- arima(random, order= c(2,0,2))
AIC(model)
## [1] 152.8508
model <- arima(random, order= c(1,0,3))
AIC(model)
## [1] 161.2369
model <- arima(random, order= c(2,0,4))
AIC(model)
## [1] 136.4657
# Refit the best order, ARIMA(2,0,4), to inspect its full summary.
model <- arima(random, order= c(2,0,4))
summary(model)
##
## Call:
## arima(x = random, order = c(2, 0, 4))
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 intercept
## 1.3497 -0.5302 -1.2977 0.2209 -0.0766 0.2773 0.9769
## s.e. 0.0812 0.0803 0.0858 0.1143 0.0850 0.0601 0.0098
##
## sigma^2 estimated as 0.07981: log likelihood = -60.23, aic = 136.47
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0001938992 0.2825043 0.2150155 -9.894616 25.71076 0.6971252
## ACF1
## Training set -0.0007301952
# Residual plot, then in-sample fit: the decomposition was multiplicative,
# so the fitted random term is multiplied by trend and seasonality to get
# back to the original scale, and overlaid on the actual series.
plot(model$residuals)
title("Residuals")
modelfit <- random - model$residuals
fitted <- modelfit*trend*season
plot(soldts)
points(fitted, type= "l", col = 2)
# Pairwise plots of sold_count against candidate regressors, then split off
# the first 7 rows (the newest dates) as the hold-out set.
ggpairs(pr7, columns = c(4,7,8,10,12 ))
traindata <- sold[-c(1:7),]
head(traindata)
## event_date sold_count
## 1: 2021-06-11 87
## 2: 2021-06-10 108
## 3: 2021-06-09 73
## 4: 2021-06-08 64
## 5: 2021-06-07 68
## 6: 2021-06-06 88
# Hold-out set: the newest 7 days.
testdata <- sold[c(1:7),]
head(testdata)
## event_date sold_count
## 1: 2021-06-18 57
## 2: 2021-06-17 79
## 3: 2021-06-16 81
## 4: 2021-06-15 78
## 5: 2021-06-14 47
## 6: 2021-06-13 95
# Training-period regressors: columns 7 and 13 (basket_count,
# category_favored) with the 7 hold-out rows removed.
regressors <- pr7[-c(1:7), c(7,13)]
head(regressors)
## basket_count category_favored
## 1: 577 26362
## 2: 571 21572
## 3: 363 19164
## 4: 322 19570
## 5: 298 21261
## 6: 498 25471
# Redo the weekly multiplicative decomposition on the TRAINING series only
# (reversed to chronological order), then refit the chosen ARIMA(2,0,4).
traindatats <- ts(rev(traindata$sold_count),frequency = 7, start = c(1,1))
resultdec <- decompose(traindatats,type= "multiplicative")
trend = resultdec$trend
season = resultdec$seasonal
random = resultdec$random
plot(resultdec)
model <- arima(random, order= c(2,0,4))
summary(model)
##
## Call:
## arima(x = random, order = c(2, 0, 4))
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 intercept
## 1.3510 -0.5322 -1.2899 0.2086 -0.0715 0.2767 0.9760
## s.e. 0.0813 0.0811 0.0860 0.1147 0.0856 0.0606 0.0099
##
## sigma^2 estimated as 0.08008: log likelihood = -59.78, aic = 135.56
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0001895856 0.2829835 0.2150431 -9.928065 25.73001 0.6995601
## ACF1
## Training set -0.0005191347
# ARIMAX: add both candidate regressors to the ARIMA(2,0,4) fit.
model <- arima(random, order= c(2,0,4), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(2, 0, 4), xreg = regressors)
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 intercept
## 1.3659 -0.5381 -1.3120 0.2175 -0.0651 0.2827 0.9549
## s.e. 0.0786 0.0764 0.0835 0.1128 0.0862 0.0606 0.0134
## basket_count category_favored
## 0e+00 0
## s.e. 1e-04 NaN
##
## sigma^2 estimated as 0.07951: log likelihood = -58.58, aic = 137.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.383106e-05 0.2819735 0.214914 -9.811676 25.69089 0.6991398
## ACF1
## Training set -0.0004002977
# Drop the insignificant second regressor (category_favored); the original
# index vector -c(2,2) duplicated the 2 redundantly — -c(2) is equivalent.
regressors <- regressors[,-c(2)]
# Refit with basket_count only, trying the smaller ARMA(3,1) order.
model <- arima(random, order= c(3,0,1), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 1), xreg = regressors)
##
## Coefficients:
## ar1 ar2 ar3 ma1 intercept basket_count
## 0.7595 -0.2639 -0.1823 -0.6631 0.9793 0e+00
## s.e. 0.0797 0.0653 0.0587 0.0672 0.0151 1e-04
##
## sigma^2 estimated as 0.08446: log likelihood = -69.41, aic = 152.83
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0004108392 0.2906282 0.2204354 -10.25146 26.25426 0.7171019
## ACF1
## Training set 0.004760214
# Pure AR(3) alternative with the single regressor, for comparison.
model <- arima(random, order= c(3,0,0), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 0), xreg = regressors)
##
## Coefficients:
## ar1 ar2 ar3 intercept basket_count
## 0.1865 -0.1192 -0.3115 0.9816 0e+00
## s.e. 0.0489 0.0494 0.0490 0.0230 1e-04
##
## sigma^2 estimated as 0.08865: log likelihood = -78.41, aic = 168.81
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0003283209 0.2977397 0.2289943 -10.51214 27.09697 0.7449448
## ACF1
## Training set -0.03190497
# Forecast 10 steps ahead of the random component.
# NOTE(review): newxreg uses pr7$basket_count rows 1:10, which are the
# NEWEST dates in the (reverse-ordered) raw table — confirm this is the
# intended "future" regressor window; also n.ahead=10 vs a 7-day test set.
model.forecast <- predict(model, n.ahead = 10, newxreg = pr7$basket_count[c(1:10)])$pred
# Last 10 non-NA trend values and one aligned seasonal window, used to
# multiply the forecast back to the original (multiplicative) scale.
last.trend.value <-tail(resultdec$trend[!is.na(resultdec$trend)],10)
seasonality <- resultdec$seasonal[370:379]
forecast_normalized <- model.forecast*last.trend.value*seasonality
# Align forecast and test series on the same ts time base for plotting.
forecast_normalized= ts(forecast_normalized, frequency = 7, start=c(55,3))
testdata <- ts(rev(testdata$sold_count), frequency = 7, start=c(55,3))
plot(testdata,ylim = c(0,100))
points(forecast_normalized,type= "l", col = 2)
# In-sample fit of the random component for visual comparison.
modelfit <- random - model$residuals
plot(random)
points(modelfit, type= "l", col = 2)
For the second product, I followed the same steps as for the first product. Therefore, I am going to skip the descriptive comments and keep the related ones only.
# Product 8 (id 4066298): load, drop helper columns, parse dates, and
# derive numeric month/weekday features — same pipeline as product 7.
pr8 = read.csv("alldata_item8.csv")
pr8 <- as.data.table(pr8)
pr8 <- pr8[,-c("X","w_day")]
pr8 <- mutate(pr8, event_date = ymd(event_date)) # converting event date into datetime object
pr8[, Month:=as.numeric(lubridate::month(event_date,label=F))] #adding month information as a numeric variable
pr8[, Day:=as.numeric(lubridate::wday(event_date,label=F))] #adding day information as a numeric variable
head(pr8)
## price event_date product_content_id sold_count visit_count favored_count
## 1: 67.92 2021-06-18 4066298 2362 28495 1774
## 2: 72.50 2021-06-17 4066298 494 5535 344
## 3: 72.53 2021-06-16 4066298 383 4782 231
## 4: 72.56 2021-06-15 4066298 381 4325 226
## 5: 72.67 2021-06-14 4066298 269 3867 226
## 6: 72.54 2021-06-13 4066298 298 3806 226
## basket_count category_sold category_brand_sold category_visits ty_visits
## 1: 6862 10011 8616 140196 91784941
## 2: 1407 3575 1919 49818 102409626
## 3: 1140 3721 1822 53000 105918459
## 4: 880 3779 1819 50480 103541571
## 5: 773 3099 1355 47481 107738598
## 6: 776 2962 1344 44974 124336805
## category_basket category_favored Month weeknumber Day
## 1: 31630 11462 6 NA 6
## 2: 11647 3949 6 24 5
## 3: 13130 4131 6 24 4
## 4: 11154 4093 6 24 3
## 5: 10059 4170 6 24 2
## 6: 9868 4324 6 24 1
# Keep only the date/sold_count pair for the univariate analysis
# (rows are newest-first, as in the raw table).
sold <- data.table(event_date =pr8$event_date,
sold_count = pr8$sold_count)
head(sold)
## event_date sold_count
## 1: 2021-06-18 2362
## 2: 2021-06-17 494
## 3: 2021-06-16 383
## 4: 2021-06-15 381
## 5: 2021-06-14 269
## 6: 2021-06-13 298
# Boxplot to eyeball outliers, then the daily series and its ACF/PACF.
boxplot(sold$sold_count)
# Bug fix: the title was copy-pasted from product 85004; this section
# analyzes product 4066298 (see product_content_id in head(pr8) above).
ggplot(sold, aes(x = event_date)) +
geom_line(aes(y = sold_count)) + ggtitle("Product 4066298 Sold Amount") +
xlab("Date") + ylab("Amount Sold")
acf(sold$sold_count)
pacf(sold$sold_count)
# Additive weekly and monthly decompositions of the (reversed,
# oldest-first) series; compare their random terms visually.
soldts <- ts(rev(pr8$sold_count), freq = 7, start= c(1,1))
resultweekdec <- decompose(soldts,type= "additive")
plot(resultweekdec)
#### Monthly Seasonality
soldtsmonth <- ts(rev(pr8$sold_count), freq = 30, start= c(1,1))
resultmonthdec <- decompose(soldtsmonth,type= "additive")
plot(resultmonthdec)
plot(resultweekdec$random)
title("Random Term of Weekly Decomposed Data")
#### Random term after decomposing monthly
plot(resultmonthdec$random)
title("Random Term of Monthly Decomposed Data")
Both the monthly and the weekly decomposition seem to have a significant seasonal component, but the trend variables are not significant. We might use the weekly decomposition, as the random term of the weekly decomposition looks more like a white-noise series.
# Keep the weekly decomposition components; the ARIMA models below are
# fit on the random (remainder) term only, and trend/season are added
# back later to reconstruct fitted values.
random = resultweekdec$random
trend = resultweekdec$trend
season = resultweekdec$seasonal
To decide on the parameters of the ARIMA model, I checked the ACF and PACF plots of the data. The ACF plot has an exponentially decaying, sinusoidal pattern, and the PACF has a sudden drop at lag 1 with no significant values after lag 1, so the plots suggest an AR(1) model. But I am going to check the neighborhood of this order to find the best model.
# ACF/PACF of the remainder; na.pass is needed because decompose() leaves
# NA's in the random term at both ends (the moving-average trend is
# undefined there).
acf(random, na.action = na.pass)
pacf(random, na.action = na.pass)
# Manual grid search over ARMA(p,q) orders around the AR(1) suggested by
# the PACF; the fit with the lowest AIC is selected below.
model <- arima(random, order= c(1,0,0))
AIC(model)
## [1] 5320.309
model <- arima(random, order= c(2,0,0))
AIC(model)
## [1] 5296.147
model <- arima(random, order= c(3,0,0))
AIC(model)
## [1] 5262.312
model <- arima(random, order= c(4,0,0))
AIC(model)
## [1] 5262.86
model <- arima(random, order= c(1,0,1))
AIC(model)
## [1] 5314.948
model <- arima(random, order= c(2,0,1))
AIC(model)
## [1] 5179.014
model <- arima(random, order= c(3,0,1))
AIC(model)
## [1] 5177.287
model <- arima(random, order= c(4,0,1))
AIC(model)
## [1] 5179.094
model <- arima(random, order= c(3,0,2))
AIC(model)
## [1] 5175.331
model <- arima(random, order= c(2,0,2))
AIC(model)
## [1] 5177.047
model <- arima(random, order= c(1,0,2))
AIC(model)
## [1] 5218.631
So the best model we came up with is the (3,0,2) model: three autoregressive terms and two moving average terms.
# Refit the AIC-best model, ARMA(3,2), on the remainder term.
model <- arima(random, order= c(3,0,2))
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 2))
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 intercept
## 0.2317 0.2973 -0.4308 -0.2916 -0.7084 -0.0006
## s.e. 0.1592 0.1440 0.0716 0.1670 0.1669 0.1733
##
## sigma^2 estimated as 39651: log likelihood = -2580.67, aic = 5175.33
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.8829892 199.1267 111.8047 90.55069 209.8117 0.7018207
## ACF1
## Training set 0.0009035351
# Residual diagnostics: check for zero mean and constant variance.
plot(model$residuals)
title("Residuals")
# Fitted values on the remainder scale = observed remainder - residuals;
# add trend and seasonal back to compare against the original series.
modelfit <- random - model$residuals
fitted <- modelfit+trend+season
plot(soldts)
points(fitted, type= "l", col = 2)
The model seems to have a good fit, since the zero-mean and constant-variance assumptions appear to hold, but it could be improved by adding extra regressors to the model.
In this part, I am going to look for variables that might be correlated with the sold amount and can be used as regressors in the ARIMAX model. To do this, first I am going to use pairplots. But some of the data in the columns are missing for many data points, so the only columns I can use for this are "Basket Count", "Category Sold", "Category Visits" and "Category Favored". I checked the relation of each variable with the sold count. From the visualizations, Basket Count, Category Favored and Category Sold seem to have a correlation, so I am going to add those variables to the model.
# Pairwise scatterplots of sold_count (col 4) against candidate regressors:
# basket_count (7), category_sold (8), category_visits (10),
# category_favored (13). Column positions are hard-coded -- fragile if
# the csv layout ever changes.
ggpairs(pr8, columns = c(4,7,8,10,13 ))
# The data are newest-first, so rows 1:7 are the most recent week: hold
# them out as the test set and train on the remaining rows.
traindata <- sold[-c(1:7),]
head(traindata)
## event_date sold_count
## 1: 2021-06-11 230
## 2: 2021-06-10 253
## 3: 2021-06-09 244
## 4: 2021-06-08 249
## 5: 2021-06-07 266
## 6: 2021-06-06 448
testdata <- sold[c(1:7),]
head(testdata)
## event_date sold_count
## 1: 2021-06-18 2362
## 2: 2021-06-17 494
## 3: 2021-06-16 383
## 4: 2021-06-15 381
## 5: 2021-06-14 269
## 6: 2021-06-13 298
# External regressors for the training period (same row filter as
# traindata): basket_count, category_sold, category_favored.
regressors <- pr8[-c(1:7), c(7,8,13)]
head(regressors)
## basket_count category_sold category_favored
## 1: 631 2404 3054
## 2: 723 2670 3446
## 3: 717 2960 3679
## 4: 691 2846 4665
## 5: 657 2863 3921
## 6: 1172 3691 4918
# Decompose the training series only (weekly, additive). Note these
# assignments overwrite the full-series trend/season/random from above.
traindatats <- ts(rev(traindata$sold_count),frequency = 7, start = c(1,1))
resultdec <- decompose(traindatats,type= "additive")
trend = resultdec$trend
season = resultdec$seasonal
random = resultdec$random
plot(resultdec)
#### Without regressors
# Refit the chosen ARMA(3,2) on the training remainder as a baseline.
model <- arima(random, order= c(3,0,2))
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 2))
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 intercept
## 0.2327 0.2969 -0.4294 -0.2942 -0.7058 0.0303
## s.e. 0.1598 0.1444 0.0718 0.1676 0.1675 0.1791
##
## sigma^2 estimated as 40073: log likelihood = -2535.66, aic = 5085.32
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.3927706 200.1829 112.7018 133.16 193.5605 0.6987689 0.0005841453
#### With external regressors (ARIMAX)
model <- arima(random, order= c(3,0,2), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(3, 0, 2), xreg = regressors)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 intercept basket_count
## 0.2330 0.2964 -0.4292 -0.2948 -0.7052 0.9434 -0.0003
## s.e. 0.1605 0.1450 0.0720 0.1683 0.1682 0.5816 0.0022
## category_sold category_favored
## -0.0008 1e-04
## s.e. 0.0015 2e-04
##
## sigma^2 estimated as 40061: log likelihood = -2535.6, aic = 5091.2
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.009226 200.1518 112.6371 134.4533 194.0397 0.6983675
## ACF1
## Training set 0.0004937449
The first AR term has lost its significance. However, I am going to forecast with this model.
# Forecast 10 steps ahead with the ARIMAX fit.
# NOTE(review): the "future" regressor values are the actually observed
# test-period rows (1:10) -- usable for evaluation, but not available in
# a real forecasting setting.
newregmatrix <- pr8[c(1:10), c(7,8,13)]
model.forecast <- predict(model, n.ahead = 10, newxreg = newregmatrix)$pred
# Re-add the decomposition components: the last non-NA trend value is
# recycled across the horizon (trend is NA at the series end).
last.trend.value <-tail(resultdec$trend[!is.na(resultdec$trend)],1)
# NOTE(review): 370:379 is a hard-coded index into the seasonal component;
# confirm it lines up with the 10-step forecast horizon.
seasonality <- resultdec$seasonal[370:379]
forecast_normalized <- model.forecast+last.trend.value+seasonality
forecast_normalized= ts(forecast_normalized, frequency = 7, start=c(55,3))
# Align the held-out actuals with the forecast window and overlay them.
testdata <- ts(rev(testdata$sold_count), frequency = 7, start=c(55,3))
plot(testdata)
points(forecast_normalized,type= "l", col = 2)
# In-sample fit on the remainder scale, plus residual plot.
modelfit <- random - model$residuals
plot(random)
points(modelfit, type= "l", col = 2)
plot(model$residuals)
For product 9, I followed the same steps as for product 1. Therefore, I am going to skip the descriptive comments and keep only the related ones.
# Load product 9 data and repeat the preprocessing pipeline used for
# product 1.
pr9 = read.csv("alldata_item9.csv")
pr9 <- as.data.table(pr9)
# Drop the csv row index and the string weekday column.
pr9 <- pr9[,-c("X","w_day")]
# NOTE(review): dates here are parsed with ymd() while product 1 used
# mdy() -- presumably the raw files use different date formats; confirm.
pr9 <- mutate(pr9, event_date = ymd(event_date)) # converting event date into datetime object
pr9[, Month:=as.numeric(lubridate::month(event_date,label=F))] #adding month information as a numeric variable
pr9[, Day:=as.numeric(lubridate::wday(event_date,label=F))] #adding day information as a numeric variable
head(pr9)
## price event_date product_content_id sold_count visit_count favored_count
## 1: 146.22 2021-06-18 32939029 108 3400 369
## 2: 151.76 2021-06-17 32939029 135 3588 367
## 3: 154.62 2021-06-16 32939029 169 5338 616
## 4: 140.50 2021-06-15 32939029 265 5643 632
## 5: 140.86 2021-06-14 32939029 245 5633 755
## 6: 155.68 2021-06-13 32939029 115 3994 573
## basket_count category_sold category_brand_sold category_visits ty_visits
## 1: 366 638 604 28504 91784941
## 2: 381 805 766 33290 102409626
## 3: 631 977 924 34081 105918459
## 4: 873 1016 983 33020 103541571
## 5: 822 959 922 34965 107738598
## 6: 472 778 742 33715 124336805
## category_basket category_favored Month weeknumber Day
## 1: 2368 3375 6 NA 6
## 2: 2639 4393 6 24 5
## 3: 3425 3168 6 24 4
## 4: 3421 3108 6 24 3
## 5: 3481 3900 6 24 2
## 6: 2944 3564 6 24 1
# Build the (date, daily sold count) table for product 32939029
# (see product_content_id in the output above).
sold <- data.table(event_date =pr9$event_date,
sold_count = pr9$sold_count)
head(sold)
## event_date sold_count
## 1: 2021-06-18 108
## 2: 2021-06-17 135
## 3: 2021-06-16 169
## 4: 2021-06-15 265
## 5: 2021-06-14 245
## 6: 2021-06-13 115
# Boxplot to spot outliers, then the series over time.
boxplot(sold$sold_count)
# FIX: the plot title previously said "Product 85004" (copy-paste from
# another product); this chunk analyses product 32939029.
ggplot(sold, aes(x = event_date)) +
geom_line(aes(y = sold_count)) + ggtitle("Product 32939029 Sold Amount") +
xlab("Date") + ylab("Amount Sold")
# ACF/PACF of the raw (newest-first, undecomposed) series.
acf(sold$sold_count)
pacf(sold$sold_count)
# Weekly vs monthly decomposition, as for the previous product; rev()
# restores chronological order (rows are stored newest-first).
soldts <- ts(rev(pr9$sold_count), freq = 7, start= c(1,1))
resultweekdec <- decompose(soldts,type= "additive")
plot(resultweekdec)
soldtsmonth <- ts(rev(pr9$sold_count), freq = 30, start= c(1,1))
resultmonthdec <- decompose(soldtsmonth,type= "additive")
plot(resultmonthdec)
# Compare remainder terms to pick the seasonal period.
plot(resultweekdec$random)
title("Random Term of Weekly Decomposed Data")
plot(resultmonthdec$random)
title("Random Term of Monthly Decomposed Data")
# Proceed with the weekly decomposition components.
random = resultweekdec$random
trend = resultweekdec$trend
season = resultweekdec$seasonal
# na.pass: decompose() leaves NA's in the remainder at both ends.
acf(random, na.action = na.pass)
pacf(random, na.action = na.pass)
# Manual AIC grid search over ARMA(p,q) orders on the weekly remainder;
# the lowest-AIC fit, (2,0,1), is selected below.
model <- arima(random, order= c(1,0,0))
AIC(model)
## [1] 3826.368
model <- arima(random, order= c(2,0,0))
AIC(model)
## [1] 3780.466
model <- arima(random, order= c(3,0,0))
AIC(model)
## [1] 3758.944
model <- arima(random, order= c(3,0,1))
AIC(model)
## [1] 3671.828
model <- arima(random, order= c(3,0,2))
AIC(model)
## [1] 3672.787
model <- arima(random, order= c(4,0,1))
AIC(model)
## [1] 3673.58
model <- arima(random, order= c(2,0,2))
AIC(model)
## [1] 3671.823
model <- arima(random, order= c(1,0,3))
AIC(model)
## [1] 3680.439
model <- arima(random, order= c(1,0,2))
AIC(model)
## [1] 3707.888
model <- arima(random, order= c(0,0,1))
AIC(model)
## [1] 3810.928
model <- arima(random, order= c(0,0,2))
AIC(model)
## [1] 3757.853
model <- arima(random, order= c(4,0,2))
AIC(model)
## [1] 3674.416
model <- arima(random, order= c(2,0,1))
AIC(model)
## [1] 3669.87
So the best model we came up with is the (2,0,1) model: two autoregressive terms and one moving average term.
# Refit the AIC-best model, ARMA(2,1), on the remainder.
# NOTE(review): the estimated ma1 sits on the unit boundary (-1.0000),
# which often hints at over-fitting or over-adjustment -- worth checking.
model <- arima(random, order= c(2,0,1))
summary(model)
##
## Call:
## arima(x = random, order = c(2, 0, 1))
##
## Coefficients:
## ar1 ar2 ma1 intercept
## 0.9769 -0.5113 -1.0000 -0.0016
## s.e. 0.0442 0.0442 0.0069 0.0242
##
## sigma^2 estimated as 794.6: log likelihood = -1829.94, aic = 3669.87
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.4899817 28.18822 19.48535 59.99377 176.7597 0.6895342
## ACF1
## Training set -0.005849612
# Residual diagnostics.
plot(model$residuals)
title("Residuals")
# Back-transform: fitted remainder + trend + seasonal, overlaid on the
# observed series.
modelfit <- random - model$residuals
fitted <- modelfit+trend+season
plot(soldts)
points(fitted, type= "l", col = 2)
# Regressor screening for product 9; same hard-coded column positions as
# before (sold_count, basket_count, category_sold, category_visits,
# category_favored).
ggpairs(pr9, columns = c(4,7,8,10,13 ))
# Hold out the most recent week (rows 1:7, data are newest-first) as the
# test set; train on the rest.
traindata <- sold[-c(1:7),]
head(traindata)
## event_date sold_count
## 1: 2021-06-11 76
## 2: 2021-06-10 100
## 3: 2021-06-09 84
## 4: 2021-06-08 133
## 5: 2021-06-07 116
## 6: 2021-06-06 127
testdata <- sold[c(1:7),]
head(testdata)
## event_date sold_count
## 1: 2021-06-18 108
## 2: 2021-06-17 135
## 3: 2021-06-16 169
## 4: 2021-06-15 265
## 5: 2021-06-14 245
## 6: 2021-06-13 115
# For this product only basket_count is used as the external regressor
# (a plain vector, not a matrix as for the previous product).
regressors <- pr9$basket_count[-c(1:7)]
head(regressors)
## [1] 267 384 406 523 494 564
# Decompose the training series (weekly, additive); these assignments
# overwrite the full-series components from the earlier chunk.
traindatats <- ts(rev(traindata$sold_count),frequency = 7, start = c(1,1))
resultdec <- decompose(traindatats,type= "additive")
trend = resultdec$trend
season = resultdec$seasonal
random = resultdec$random
plot(resultdec)
#### Without regressor: baseline ARMA(2,1) on the training remainder.
model <- arima(random, order= c(2,0,1))
summary(model)
##
## Call:
## arima(x = random, order = c(2, 0, 1))
##
## Coefficients:
## ar1 ar2 ma1 intercept
## 0.9698 -0.4988 -1.0000 0.0025
## s.e. 0.0445 0.0445 0.0073 0.0250
##
## sigma^2 estimated as 783.9: log likelihood = -1794.04, aic = 3598.07
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.5702988 27.99792 19.18569 153.6635 246.4155 0.6824581
## ACF1
## Training set -0.003645417
#### With basket_count as external regressor (ARIMAX).
model <- arima(random, order= c(2,0,1), xreg = regressors)
summary(model)
##
## Call:
## arima(x = random, order = c(2, 0, 1), xreg = regressors)
##
## Coefficients:
## ar1 ar2 ma1 intercept regressors
## 0.9689 -0.4994 -1.0000 0.0485 -1e-04
## s.e. 0.0445 0.0444 0.0071 0.0663 2e-04
##
## sigma^2 estimated as 782.6: log likelihood = -1793.72, aic = 3599.45
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.08742238 27.97421 19.13569 147.9856 240.8208 0.6806798
## ACF1
## Training set -0.003822443
The coefficients of the earlier model have not changed, but the basket-count coefficient seems insignificant, so the regressor adds little. (Note: the forecast below nevertheless uses this ARIMAX fit and supplies newxreg.)
# Forecast 10 steps ahead using the observed test-period basket counts as
# the "future" regressor values (evaluation only -- these would not be
# known in a real forecast setting).
model.forecast <- predict(model, n.ahead = 10, newxreg = pr9$basket_count[c(1:10)])$pred
# NOTE(review): here the LAST 10 trend values are added elementwise to
# the 10 forecasts, whereas for the previous product a single last trend
# value was recycled (tail(..., 1)) -- confirm which was intended.
last.trend.value <-tail(resultdec$trend[!is.na(resultdec$trend)],10)
# NOTE(review): 367:376 is a hard-coded seasonal index; confirm it aligns
# with the 10-step forecast horizon.
seasonality <- resultdec$seasonal[367:376]
forecast_normalized <- model.forecast+last.trend.value+seasonality
forecast_normalized= ts(forecast_normalized, frequency = 7, start=c(55,3))
# Align the held-out actuals with the forecast window and overlay.
testdata <- ts(rev(testdata$sold_count), frequency = 7, start=c(55,3))
plot(testdata)
points(forecast_normalized,type= "l", col = 2)
# In-sample fit: remainder fit + training trend + seasonal, plotted over
# soldts (the full-series ts built in the earlier chunk).
modelfit <- random - model$residuals
fitted <- modelfit+trend+season
plot(soldts)
points(fitted, type= "l", col = 2)
title("Fitted vs Real Values")
Looking at the model overall, the zero-mean and independence assumptions for the residuals seem to hold, but the increasing variance is somewhat of a problem. For this product, I might use an ARIMAX model rather than an LM model, but at some points the peaks should be excluded.
Comments